# Restore the objects stored in the .Rda file into the workspace.
# The code below expects this to provide `df.collapsed` (TODO confirm).
load("../output/mediatenorCOLLAPSEDDF.Rda")

# Preview the first 20 rows of the collapsed data.
df.collapsed[seq_len(20L), ]
## Warning in format.POSIXlt(as.POSIXlt(x), ...): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Europe/Berlin'
## # A tibble: 20 x 6
## date medium p_group category obs wertung
## <date> <fct> <fct> <chr> <int> <dbl>
## 1 1998-02-01 heute CDU/CSU news_tv 2 0
## 2 1998-02-01 heute SPD news_tv 2 -0.500
## 3 1998-02-01 heute journal CDU/CSU news_tv 3 0
## 4 1998-02-01 heute journal SPD news_tv 1 0
## 5 1998-02-01 RTL Aktuell CDU/CSU news_tv 1 0
## 6 1998-02-01 RTL Aktuell SPD news_tv 1 0
## 7 1998-02-01 Sat.1 News SPD news_tv 15 0
## 8 1998-02-02 Berliner Bündnis 90/ Die Grüne daily_pri… 9 -0.222
## 9 1998-02-02 Berliner CDU/CSU daily_pri… 13 0
## 10 1998-02-02 Berliner FDP daily_pri… 4 0
## 11 1998-02-02 Berliner Linke/PDS/WASG daily_pri… 8 0.250
## 12 1998-02-02 Berliner SPD daily_pri… 91 0.0110
## 13 1998-02-02 Bild Bündnis 90/ Die Grüne daily_pri… 1 0
## 14 1998-02-02 Bild CDU/CSU daily_pri… 1 0
## 15 1998-02-02 Bild FDP daily_pri… 4 0
## 16 1998-02-02 Bild SPD daily_pri… 18 0
## 17 1998-02-02 Die Welt Bündnis 90/ Die Grüne daily_pri… 11 0
## 18 1998-02-02 Die Welt CDU/CSU daily_pri… 19 -0.158
## 19 1998-02-02 Die Welt FDP daily_pri… 1 -1.00
## 20 1998-02-02 Die Welt Linke/PDS/WASG daily_pri… 3 -0.667
# Draw a faceted bar chart of `wertung` over `date` for one media category.
# Facet rows are the media outlets, facet columns are the party groups.
#
# data           data frame with the columns date, wertung, p_group, medium
#                and category
# category_name  value of `category` whose rows are kept
# plot_title     title shown above the plot
#
# Returns the ggplot object; it is drawn when printed (e.g. when the call is
# evaluated at top level).
plot_wertung_by_category <- function(data, category_name, plot_title) {
  data %>%
    filter(category == category_name) %>%
    ggplot(aes(date, wertung, color = p_group)) +
    geom_col() +
    facet_grid(medium ~ p_group) +
    labs(x = "", y = "", title = plot_title) +
    # p_group is already shown as the facet column, so the legend is dropped
    theme(legend.position = "none")
}

# One plot per media category.
plot_wertung_by_category(df.collapsed, "daily_print", "Tageszeitungen")
## Warning: Removed 2 rows containing missing values (position_stack).
plot_wertung_by_category(df.collapsed, "news_tv", "Nachrichtensendungen")
plot_wertung_by_category(
  df.collapsed, "magazine_print", "Magazine und Wochenzeitungen"
)
plot_wertung_by_category(df.collapsed, "polit_tv", "Politsendungen")

Clustering is a broad set of techniques for finding subgroups of observations within a data set. When we cluster observations, we want observations in the same group to be similar and observations in different groups to be dissimilar. Because there isn’t a response variable, this is an unsupervised method, which implies that it seeks to find relationships between the n observations without being trained by a response variable. Clustering allows us to identify which observations are alike, and potentially categorize them therein. K-means clustering is the simplest and the most commonly used clustering method for splitting a dataset into a set of k groups.
To perform a cluster analysis in R, generally, the data should be prepared as follows:
Rows are observations (individuals) and columns are variables.
Any missing value in the data must be removed or estimated.
The data must be standardized (i.e., scaled) to make variables comparable.
I group the data by medium, party group (p_group) and year, compute the mean rating (wertung) for each combination, and then spread the party groups into separate columns:
# Build the clustering input: one row per medium and year, with one column
# per remaining party group holding that group's mean `wertung`.
df.cluster <-
  df.collapsed %>%
  # Leave these three party groups out of the clustering input.
  filter(!p_group %in% c("Andere Parteien",
                         "Rechtsextreme Parteien",
                         "Linke/PDS/WASG")) %>%
  # Aggregation is per calendar year.
  mutate(year = year(date)) %>%
  group_by(medium, p_group, year) %>%
  # NOTE(review): this is an unweighted mean of the collapsed rows; the `obs`
  # column is not used as a weight -- confirm that this is intended.
  summarise(wertung = mean(wertung, na.rm = TRUE)) %>%
  ungroup() %>%
  # Wide format: one column per party group. (spread() is superseded by
  # tidyr::pivot_wider(); kept here to stay compatible with older tidyr.)
  spread(p_group, wertung) %>%
  # Row label of the form "year/medium".
  mutate(id = paste(year, medium, sep = "/"))
# Drop the identifier columns and turn the remaining per-party columns into
# a matrix for clustering.
m.cluster <- df.cluster %>%
  select(- c(medium, year, id)) %>%
  as.matrix()

# Label every row with its "year/medium" id so it can be identified later.
rownames(m.cluster) <- df.cluster$id

# Show the first rows of the matrix.
head(m.cluster)
## Bündnis 90/ Die Grüne CDU/CSU FDP SPD
## 2001/BamS -0.14696045 -0.0330213065 -0.004668445 -0.09314338
## 2002/BamS -0.13496912 -0.0035470546 -0.021011295 -0.11205892
## 2003/BamS -0.09701104 -0.0008290094 -0.007665907 -0.13620967
## 2004/BamS -0.02201161 -0.0234293898 -0.036320782 -0.06115106
## 2005/BamS -0.16178368 0.0031316861 0.072047652 -0.06543663
## 2006/BamS -0.10737886 -0.0580219727 -0.032191638 -0.11190067
# Drop every row that contains a missing value.
m.cluster <- na.omit(m.cluster)

K-means clustering is the most commonly used unsupervised machine learning algorithm for partitioning a given data set into a set of k groups (i.e. k clusters), where k represents the number of groups pre-specified by the analyst. It classifies objects in multiple groups (i.e., clusters), such that objects within the same cluster are as similar as possible (i.e., high intra-class similarity), whereas objects from different clusters are as dissimilar as possible (i.e., low inter-class similarity). In k-means clustering, each cluster is represented by its center (i.e., centroid) which corresponds to the mean of points assigned to the cluster.
# kmeans() draws its starting centres at random; fix the seed so the cluster
# assignments and the plots below are the same on every run.
set.seed(123)

# NOTE(review): the accompanying text says the data must be standardized, but
# no scaling step is visible before this point -- confirm that clustering the
# raw means is intended.

# Fit k-means for k = 2..5, each with 25 random starts (nstart = 25).
k2 <- kmeans(m.cluster, centers = 2, nstart = 25)
k3 <- kmeans(m.cluster, centers = 3, nstart = 25)
k4 <- kmeans(m.cluster, centers = 4, nstart = 25)
k5 <- kmeans(m.cluster, centers = 5, nstart = 25)

# Plots to compare the four solutions.
p1 <- fviz_cluster(k2, data = m.cluster) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, data = m.cluster) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, data = m.cluster) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, data = m.cluster) + ggtitle("k = 5")
# Attach gridExtra for grid.arrange(); the echoed startup message follows.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Hmisc':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine

# Show the four cluster plots together in a grid with two rows.
grid.arrange(grobs = list(p1, p2, p3, p4), nrow = 2)